In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os

%matplotlib inline
%config InlineBackend.figure_format = 'png'
pd.set_option("max_columns",50)

In [3]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)


Wall time: 43.3 s

In [4]:
use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]

In [5]:
train_y = train[["hotel_cluster"]]

In [6]:
train_x = train[use_col]

In [8]:
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [13]:
train_x["period"] = train_x["srch_co"] - train_x["srch_ci"]


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [23]:
train_x["period"] = (train_x["period"] / np.timedelta64(1, 'D')).astype(int)


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [26]:
train_x = train_x.drop(["srch_co","srch_ci"], axis=1)

In [29]:
train_x["srch_adults_cnt"].value_counts()


Out[29]:
2    27226
1    17543
3     2219
4     2132
6      334
5      269
0      113
8       92
7       54
9       18
Name: srch_adults_cnt, dtype: int64

In [34]:
train_x.tail()


Out[34]:
user_location_region user_location_region hotel_market srch_destination_id hotel_country srch_adults_cnt srch_children_cnt hotel_cluster period
49995 790 790 1447 8823 0 1 0 8 9
49996 448 448 696 8271 50 2 0 68 2
49997 403 403 438 11993 50 4 0 15 1
49998 824 824 1108 21816 63 2 1 98 2
49999 50 50 772 20001 105 2 1 22 2

In [36]:
train_x["num"] = 1


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [41]:
pd.pivot_table(train_x, values='num', index=['srch_adults_cnt'], columns=['srch_children_cnt'], aggfunc=np.sum)


Out[41]:
srch_children_cnt 0 1 2 3 4 5 6 7 8 9
srch_adults_cnt
0 112.0 1.0 NaN NaN NaN NaN NaN NaN NaN NaN
1 13900.0 3338.0 251.0 50.0 1.0 1.0 1.0 1.0 NaN NaN
2 19629.0 5024.0 2253.0 263.0 51.0 2.0 3.0 NaN 1.0 NaN
3 1443.0 593.0 115.0 49.0 15.0 3.0 NaN NaN NaN 1.0
4 1438.0 420.0 171.0 55.0 42.0 3.0 1.0 1.0 1.0 NaN
5 191.0 45.0 21.0 9.0 2.0 1.0 NaN NaN NaN NaN
6 238.0 59.0 19.0 7.0 3.0 4.0 3.0 1.0 NaN NaN
7 38.0 11.0 3.0 NaN 1.0 1.0 NaN NaN NaN NaN
8 69.0 12.0 2.0 2.0 4.0 1.0 1.0 NaN 1.0 NaN
9 13.0 2.0 NaN NaN 2.0 NaN 1.0 NaN NaN NaN

In [43]:
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
# 3은 가족여행 나머지는 1(혼자) 2(이건 애매함)


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [45]:
train_x = train_x.drop(["num","srch_children_cnt"], axis=1)

In [60]:
train_x.tail()


Out[60]:
hotel_market srch_destination_id hotel_country srch_adults_cnt hotel_cluster period
49995 1447 8823 0 1 8 9
49996 696 8271 50 2 68 2
49997 438 11993 50 3 15 1
49998 1108 21816 63 2 98 2
49999 772 20001 105 2 22 2

In [61]:
train_x["srch_destination_id"].value_counts()


Out[61]:
8250     1825
8267      945
12206     492
8253      470
8279      470
8745      384
8268      372
8230      364
8791      358
8260      313
8254      302
8291      294
7635      287
8788      273
8223      266
8278      262
8746      260
8220      258
8242      251
8819      237
669       210
468       209
8747      204
26022     199
8287      198
8213      195
8288      192
12603     189
8266      185
8282      183
         ... 
26333       1
26365       1
44804       1
28476       1
18009       1
42821       1
1905        1
22395       1
44932       1
24474       1
34721       1
1681        1
7762        1
5395        1
23962       1
15670       1
13655       1
23898       1
42341       1
23930       1
5523        1
26013       1
30239       1
3536        1
24026       1
22011       1
34305       1
5651        1
26141       1
12294       1
Name: srch_destination_id, dtype: int64

In [65]:
train_x["num"] = 1

In [70]:
pd.pivot_table(train_x, values='num', index=['srch_adults_cnt'], columns=['period'], aggfunc=np.sum)


Out[70]:
period 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 25 26 27 28
srch_adults_cnt
0 14.0 10.0 18.0 30.0 9.0 8.0 15.0 1.0 2.0 NaN 2.0 NaN NaN 3.0 NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 8045.0 3892.0 2432.0 1387.0 742.0 384.0 292.0 102.0 58.0 49.0 41.0 24.0 26.0 22.0 9.0 4.0 4.0 6.0 5.0 5.0 5.0 1.0 1.0 NaN 3.0 1.0 3.0
2 11045.0 5950.0 4092.0 2509.0 1400.0 776.0 762.0 233.0 138.0 116.0 48.0 37.0 31.0 42.0 12.0 3.0 5.0 3.0 5.0 6.0 6.0 1.0 NaN 1.0 NaN 1.0 4.0
3 2030.0 1178.0 819.0 493.0 224.0 121.0 140.0 37.0 26.0 19.0 7.0 9.0 2.0 5.0 2.0 3.0 2.0 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN

In [75]:
sns.heatmap(pd.pivot_table(train_x, values='num', index=['srch_adults_cnt'], columns=['period'], aggfunc=np.sum))


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x188ed60f0>

In [77]:
train_x = train_x.drop(["num"], axis=1)

In [93]:
train_x = train_x[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

In [91]:
train_y.head()


Out[91]:
hotel_cluster
0 15
1 72
2 58
3 56
4 42

In [97]:
%%time
use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]

test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)

test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))

model.fit(train,train_y)

preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))

print("save file")

result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)


read the test.csv
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-97-7e39be7b8c21> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'use_col = ["srch_co","srch_ci","user_location_region",\\\n               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]\nprint("read the test.csv")\ntest = pd.read_csv("../data/test.csv")\ntest = test[use_col]\n\ntest["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")\ntest["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")\ntest["period"] = test["srch_co"] - test["srch_ci"]\ntest["period"] = (test["period"] / np.timedelta64(1, \'D\')).astype(int)\ntest = test.drop(["srch_co","srch_ci"], axis=1)\ntest["num"] = 1\ntest["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)\ntest = test.drop(["num","srch_children_cnt"], axis=1)\n\ntest = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]\n\nprint("modeling strart")\nmodel = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)\nprint(\'=\'*50)\nprint(\'# Test shape : {}\'.format(test.shape))\n\nmodel.fit(train,train_y)\n\npreds = model.predict_proba(test)\npreds = np.fliplr(np.argsort(preds, axis=1))\n\nprint("save file")\n\nresult_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])\nresult_df.index.names = ["id"]\nfile_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + \'.csv\'\nresult_df.to_csv(os.path.join(\'../output\',file_name), index=True)')

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2113             magic_arg_s = self.var_expand(line, stack_depth)
   2114             with self.builtin_trap:
-> 2115                 result = fn(magic_arg_s, cell)
   2116             return result
   2117 

<decorator-gen-60> in time(self, line, cell, local_ns)

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, line, cell, local_ns)
   1178         else:
   1179             st = clock2()
-> 1180             exec(code, glob, local_ns)
   1181             end = clock2()
   1182             out = None

<timed exec> in <module>()

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\generic.pyc in astype(self, dtype, copy, raise_on_error, **kwargs)
   2948 
   2949         mgr = self._data.astype(dtype=dtype, copy=copy,
-> 2950                                 raise_on_error=raise_on_error, **kwargs)
   2951         return self._constructor(mgr).__finalize__(self)
   2952 

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in astype(self, dtype, **kwargs)
   2936 
   2937     def astype(self, dtype, **kwargs):
-> 2938         return self.apply('astype', dtype=dtype, **kwargs)
   2939 
   2940     def convert(self, **kwargs):

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in apply(self, f, axes, filter, do_integrity_check, consolidate, raw, **kwargs)
   2888 
   2889             kwargs['mgr'] = self
-> 2890             applied = getattr(b, f)(**kwargs)
   2891             result_blocks = _extend_blocks(applied, result_blocks)
   2892 

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in astype(self, dtype, copy, raise_on_error, values, **kwargs)
    432                **kwargs):
    433         return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
--> 434                             values=values, **kwargs)
    435 
    436     def _astype(self, dtype, copy=False, raise_on_error=True, values=None,

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in _astype(self, dtype, copy, raise_on_error, values, klass, mgr, **kwargs)
    475 
    476                 # _astype_nansafe works fine with 1-d only
--> 477                 values = com._astype_nansafe(values.ravel(), dtype, copy=True)
    478                 values = values.reshape(self.shape)
    479 

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\common.pyc in _astype_nansafe(arr, dtype, copy)
   1912 
   1913         if np.isnan(arr).any():
-> 1914             raise ValueError('Cannot convert NA to integer')
   1915     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
   1916         # work around NumPy brokenness, #1987

ValueError: Cannot convert NA to integer

In [96]:
test


Out[96]:
srch_co srch_ci user_location_region hotel_market srch_destination_id hotel_country srch_adults_cnt srch_children_cnt period
0 2016-05-23 2016-05-19 174 27 12243 204 2 0 4 days
1 2016-05-15 2016-05-12 174 1540 14474 204 2 0 3 days
2 2015-07-27 2015-07-26 142 699 11353 50 4 0 1 days
3 2015-09-16 2015-09-14 258 628 8250 50 2 0 2 days
4 2015-07-23 2015-07-22 467 538 11812 50 2 0 1 days
5 2015-07-24 2015-07-22 311 447 11827 50 4 0 2 days
6 2015-08-03 2015-08-02 311 696 8271 50 2 0 1 days
7 2015-08-04 2015-08-03 348 191 8291 50 2 0 1 days
8 2015-12-31 2015-12-30 311 628 8250 50 2 0 1 days
9 2016-01-03 2016-01-02 311 364 9145 50 2 0 1 days
10 2016-01-06 2016-01-05 311 1230 12267 50 2 0 1 days
11 2015-08-21 2015-08-18 120 312 20813 70 2 1 3 days
12 2015-06-08 2015-06-05 342 699 11353 50 2 0 3 days
13 2015-06-26 2015-06-25 342 1501 8826 47 2 0 1 days
14 2015-08-04 2015-08-03 342 371 8228 198 2 0 1 days
15 2015-08-09 2015-08-04 342 789 1886 198 2 0 5 days
16 2015-08-19 2015-08-16 342 682 8268 50 2 1 3 days
17 2015-08-09 2015-08-01 342 686 12008 50 3 1 8 days
18 2015-08-20 2015-08-19 342 409 8243 50 2 0 1 days
19 2015-08-31 2015-08-26 342 698 8809 50 2 0 5 days
20 2015-09-03 2015-09-01 342 688 8219 50 2 0 2 days
21 2015-09-13 2015-09-11 342 701 8260 50 2 0 2 days
22 2015-11-29 2015-11-22 342 1511 8826 47 2 0 7 days
23 2015-10-18 2015-10-16 342 1502 1152 47 2 0 2 days
24 2016-02-01 2016-01-28 342 675 8267 50 2 0 4 days
25 2015-11-29 2015-11-25 174 675 8267 50 1 0 4 days
26 2015-08-24 2015-08-23 135 43 8217 34 2 0 1 days
27 2015-06-14 2015-06-12 174 368 4406 50 2 0 2 days
28 2015-12-22 2015-12-21 435 856 26437 50 2 0 1 days
29 2015-05-04 2015-05-01 226 110 8791 8 2 0 3 days
... ... ... ... ... ... ... ... ... ...
2528213 2015-06-30 2015-06-29 258 824 17363 50 3 0 1 days
2528214 2015-08-18 2015-08-16 258 1367 26624 50 2 0 2 days
2528215 2015-07-27 2015-07-26 258 1253 43731 50 2 0 1 days
2528216 2015-08-15 2015-08-14 258 426 27578 50 2 0 1 days
2528217 2015-08-09 2015-08-08 258 1056 54920 50 2 0 1 days
2528218 2015-10-29 2015-10-24 258 61 8242 171 2 0 5 days
2528219 2015-10-02 2015-10-01 258 895 27006 50 2 0 1 days
2528220 2015-10-03 2015-10-02 258 681 26046 50 2 0 1 days
2528221 2015-10-02 2015-10-01 258 1519 26501 50 1 0 1 days
2528222 2015-11-11 2015-11-10 258 1042 30323 50 2 0 1 days
2528223 2015-11-09 2015-11-08 258 1241 24692 50 1 0 1 days
2528224 2015-06-07 2015-06-05 331 642 11321 50 2 1 2 days
2528225 2015-09-09 2015-09-06 220 29 8746 105 2 1 3 days
2528226 2015-12-27 2015-12-26 324 406 12192 50 2 0 1 days
2528227 2015-12-26 2015-12-25 324 634 11921 50 2 0 1 days
2528228 2015-05-14 2015-05-12 196 2060 8222 119 2 0 2 days
2528229 2015-09-03 2015-09-02 970 2063 48884 119 1 0 1 days
2528230 2015-11-05 2015-11-04 970 2062 330 119 2 0 1 days
2528231 2015-11-17 2015-11-15 442 408 26956 50 2 0 2 days
2528232 2016-01-02 2015-12-31 337 628 8250 50 2 0 2 days
2528233 2015-08-23 2015-08-21 226 350 12177 50 2 0 2 days
2528234 2015-03-22 2015-03-21 354 1547 15246 146 2 2 1 days
2528235 2015-06-13 2015-06-12 258 130 38475 8 2 1 1 days
2528236 2015-09-20 2015-09-19 322 1052 4383 50 2 0 1 days
2528237 2015-07-27 2015-07-23 174 213 8855 50 2 1 4 days
2528238 2015-07-30 2015-07-27 174 214 8857 50 2 1 3 days
2528239 2015-08-05 2015-07-29 354 1749 19308 206 2 0 7 days
2528240 2015-05-19 2015-05-18 442 628 8250 50 1 0 1 days
2528241 2015-07-25 2015-07-24 184 905 4627 50 2 0 1 days
2528242 2015-02-27 2015-02-26 249 1490 38774 162 2 0 1 days

2528243 rows × 9 columns


In [140]:
%%time
use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]

test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)

test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))

model.fit(train_x,train_y)

preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))

print("save file")

result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)


read the test.csv
modeling strart
==================================================
# Test shape : (2528243, 6)
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:22: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
save file
Wall time: 12min 1s

public score : 0.16293


In [141]:
result_df


Out[141]:
hotel_cluster
id
0 25 64 5 46 2
1 82 62 67 30 61
2 91 42 48 18 16
3 1 79 45 54 24
4 91 48 42 95 1
5 91 48 42 95 1
6 91 18 95 42 48
7 91 18 95 48 59
8 1 79 45 54 24
9 91 48 95 42 18
10 91 98 18 95 70
11 25 59 46 82 14
12 91 56 98 41 70
13 91 48 42 58 6
14 95 91 55 37 48
15 99 43 42 28 91
16 91 56 98 41 70
17 91 98 56 48 41
18 91 48 95 18 59
19 91 98 56 48 41
20 56 91 41 95 70
21 56 91 41 70 98
22 65 52 66 31 34
23 65 48 91 42 62
24 56 98 41 70 55
... ...
2528218 46 64 59 29 2
2528219 91 48 42 7 43
2528220 91 48 42 50 28
2528221 91 48 42 7 32
2528222 91 48 42 7 43
2528223 91 48 42 7 32
2528224 91 95 18 41 42
2528225 64 22 99 46 9
2528226 91 48 95 42 18
2528227 91 1 95 18 48
2528228 82 67 36 46 64
2528229 67 82 36 61 62
2528230 82 67 46 61 36
2528231 91 48 42 28 13
2528232 1 79 45 54 24
2528233 91 48 95 42 18
2528234 82 67 62 36 61
2528235 43 5 6 29 4
2528236 91 48 42 47 50
2528237 26 0 73 84 91
2528238 26 0 91 1 48
2528239 82 43 30 78 67
2528240 1 79 54 45 24
2528241 91 48 42 47 50
2528242 67 46 29 36 82

2528243 rows × 1 columns

안되겠다 트리50개로 해봐야지

csv로 뽑고 => h2o.ai에서 진행


In [ ]:
%%time
use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]

test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)

test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))

model.fit(train_x,train_y)

preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))

print("save file")

result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)

In [148]:
train_sum = pd.concat([train_x,train_y], axis=1)

In [149]:
train_sum.to_csv("train_data.csv")
test.to_csv("test_data.csv")

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [1]:
# 하나로 정리

In [ ]:
%%time

print('preprocessing train_data')
use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]

train_y = train[["hotel_cluster"]]

train_x = train[use_col]
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")
train_x["period"] = train_x["srch_co"] - train_x["srch_ci"]
train_x["period"] = (train_x["period"] / np.timedelta64(1, 'D')).astype(int)
train_x = train_x.drop(["srch_co","srch_ci"], axis=1)
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
train_x = train_x.drop(["srch_children_cnt"], axis=1)
train_x = train_x[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]



use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]

print("preprocessing test_data")

test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)

test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))

model.fit(train_x,train_y)

preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))

print("save file")

result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)

In [8]:
train[["srch_children_cnt","srch_adults_cnt"]].groupby(["srch_children_cnt"]).agg


Out[8]:
srch_adults_cnt
srch_children_cnt
0 66557
1 17615
2 6042
3 1046
4 401
5 70
6 46
7 11
8 14
9 3

In [10]:
from csv import DictReader
from collections import defaultdict
from datetime import datetime

start = datetime.now()

def get_top5(d):
    return " ".join(sorted(d, key=d.get, reverse=True)[:5])

destination_clusters = defaultdict(lambda: defaultdict(int))

for i, row in enumerate(DictReader(open("../data/train.csv"))):
	destination_clusters[row["srch_destination_id"]][row["hotel_cluster"]] += 1
	if i % 1000000 == 0:
		print("%s\t%s"%(i, datetime.now() - start))

most_frequent = defaultdict(str)

for k in destination_clusters:
	most_frequent[k] = get_top5(destination_clusters[k])

with open("pred_sub.csv", "w") as outfile:
	outfile.write("id,hotel_cluster\n")
	for i, row in enumerate(DictReader(open("../data/test.csv"))):
		outfile.write("%d,%s\n"%(i,most_frequent[row["srch_destination_id"]]))
		if i % 1000000 == 0:
			print("%s\t%s"%(i, datetime.now() - start))


0	0:00:00.032000
1000000	0:00:08.536000
2000000	0:00:16.857000
3000000	0:00:25.250000
4000000	0:00:33.751000
5000000	0:00:42.339000
6000000	0:00:50.776000
7000000	0:00:59.239000
8000000	0:01:07.823000
9000000	0:01:16.249000
10000000	0:01:24.588000
11000000	0:01:32.984000
12000000	0:01:41.549000
13000000	0:01:50.041000
14000000	0:01:58.427000
15000000	0:02:06.812000
16000000	0:02:15.193000
17000000	0:02:23.633000
18000000	0:02:31.937000
19000000	0:02:40.275000
20000000	0:02:48.656000
21000000	0:02:57.063000
22000000	0:03:05.426000
23000000	0:03:15.156000
24000000	0:03:23.927000
25000000	0:03:32.309000
26000000	0:03:41.202000
27000000	0:03:49.622000
28000000	0:03:58.008000
29000000	0:04:06.977000
30000000	0:04:15.666000
31000000	0:04:24.836000
32000000	0:04:34.716000
33000000	0:04:43.532000
34000000	0:04:52.305000
35000000	0:05:00.588000
36000000	0:05:08.876000
37000000	0:05:17.220000
0	0:05:23.496000
1000000	0:05:33.737000
2000000	0:05:44.842000

In [ ]: